# Fail early when the installed packages are older than this script expects.
stopifnot(packageVersion("quanteda") >= "2.0")
stopifnot(packageVersion("LSX") >= "0.7.5")
# library() stops with an error when a package is missing; require() would only
# warn and return FALSE, letting the script continue and fail later on.
library(quanteda)
library(stringi)
library(ggplot2)

# Number of threads quanteda is allowed to use.
quanteda_options(threads = 8)
# Seed words are read from a YAML dictionary file (path relative to the
# working directory).
seedwords <- dictionary(file = "seedwords.yml")
# "marimo" stopword lists for English and Japanese.
stopwords_en <- stopwords("en", "marimo")
stopwords_ja <- stopwords("ja", "marimo")

# Tokenize an English corpus and join frequent capitalized word sequences
# into single tokens.
#
# corp: object passed to quanteda's tokens() (presumably a corpus).
#
# Returns the tokens object with the selected collocations compounded using a
# space as the concatenator.
# Side effect: saves the full collocation table `seqs_cap` to
# "collocations_en.Rdata" in the working directory.
# Reads the global `stopwords_en`.
tokenize_english <- function(corp) {

    toks <- tokens(corp, remove_url = TRUE)
    # Keep only tokens made of ASCII letters, digits, hyphens and apostrophes;
    # everything else (symbols etc.) becomes padding so positions are preserved.
    toks <- tokens_select(toks, "^[0-9a-zA-Z\\-']+$", valuetype = "regex",
                          case_insensitive = FALSE, padding = TRUE)

    min_count <- 100
    # Candidate tokens for collocations: words that start with an upper-case
    # letter, with stopwords replaced by padding. Sequential calls are used
    # instead of `%>%` so the function does not rely on the pipe operator being
    # exported by one of the attached packages.
    toks_cap <- tokens_select(toks, "^[A-Z][a-zA-Z]+$", valuetype = "regex",
                              case_insensitive = FALSE, padding = TRUE)
    toks_cap <- tokens_remove(toks_cap, stopwords_en, padding = TRUE)
    seqs_cap <- textstat_collocations(toks_cap, min_count = min_count,
                                      method = "lambda", tolower = FALSE)

    # Compound only collocations whose z statistic exceeds 3. The explicit
    # row index `[rows, ]` matches tokenize_japanese() and filters rows even
    # under plain data.frame semantics, where a single index selects columns.
    toks <- tokens_compound(toks, seqs_cap[seqs_cap$z > 3, ],
                            concatenator = " ", join = TRUE)
    save(seqs_cap, file = "collocations_en.Rdata")
    return(toks)
}

# Tokenize a Japanese corpus and join frequent token sequences in three
# passes: kanji-only sequences, katakana-only sequences, then sequences that
# mix katakana and kanji.
#
# corp: object passed to quanteda's tokens() (presumably a corpus).
#
# Returns the tokens object with the selected collocations compounded without
# a separator.
# Side effect: saves the three collocation tables (`seqs_kanji`, `seqs_kana`,
# `seqs_any`) to "collocations_ja.Rdata" in the working directory.
tokenize_japanese <- function(corp) {

    toks <- tokens(corp, remove_url = TRUE)
    # Keep tokens consisting only of full-width digits, hiragana, katakana
    # (including the long-vowel mark) and kanji; other tokens become padding.
    toks <- tokens_select(toks, "^[０-９ぁ-んァ-ヶー一-龠]+$", valuetype = 'regex', padding = TRUE)

    min_count <- 50

    # Pass 1: sequences of kanji-only tokens with z > 3.
    toks_kanji <- tokens_select(toks, '^[一-龠]+$', valuetype = 'regex', padding = TRUE)
    seqs_kanji <- textstat_collocations(toks_kanji, min_count = min_count, method = "lambda", tolower = FALSE)
    toks <- tokens_compound(toks, seqs_kanji[seqs_kanji$z > 3,], valuetype = 'fixed',
                            concatenator = '', join = TRUE)

    # Pass 2: sequences of katakana-only tokens, computed on the output of pass 1.
    toks_kana <- tokens_select(toks, '^[ァ-ヶー]+$', valuetype = 'regex', padding = TRUE)
    seqs_kana <- textstat_collocations(toks_kana, min_count = min_count, method = "lambda", tolower = FALSE)
    toks <- tokens_compound(toks, seqs_kana[seqs_kana$z > 3,], valuetype = 'fixed',
                            concatenator = '', join = TRUE)

    # Pass 3: sequences mixing katakana and kanji tokens, computed on the
    # output of pass 2. `valuetype = 'fixed'` is given explicitly so all three
    # passes match collocations literally rather than this one falling back to
    # the default pattern matching.
    toks_any <- tokens_select(toks, '^[ァ-ヶー一-龠]+$', valuetype = 'regex', padding = TRUE)
    seqs_any <- textstat_collocations(toks_any, min_count = min_count, method = "lambda", tolower = FALSE)
    toks <- tokens_compound(toks, seqs_any[seqs_any$z > 3,], valuetype = 'fixed',
                            concatenator = '', join = TRUE)

    save(seqs_kanji, seqs_kana, seqs_any, file = "collocations_ja.Rdata")
    return(toks)

}

# Re-segment tokens around the small tsu ("っ").
#
# toks: a quanteda tokens object.
#
# First every token is split at "っ", keeping the "っ" itself as a token; then
# each single-kanji token immediately followed by a "っ" token is joined back
# into one token with no separator. Returns the modified tokens object.
fix_japanese <- function(toks) {
    pieces <- tokens_split(toks, "っ", valuetype = "fixed", remove_separator = FALSE)
    kanji_then_tsu <- list(c("^[一-龠]$", "^っ$"))
    tokens_compound(pieces, kanji_then_tsu, valuetype = "regex", concatenator = "")
}

# Annotate the current base-graphics plot with dated events.
#
# x:      named list (or vector) of dates; it is flattened with unlist() and
#         converted with as.Date(). The element names are used as labels.
# bottom: if TRUE the labels are anchored 2% of the plot height above the
#         lower edge of the plot region; otherwise 2% below the upper edge.
#
# Draws a dotted vertical line at every date and a label rotated by 90
# degrees next to it. Returns the value of text().
add_events <- function(x, bottom = TRUE) {
    flat <- unlist(x)
    when <- as.Date(flat)
    abline(v = when, lty = 3)

    # par("usr") is c(x1, x2, y1, y2) of the plot region in user coordinates.
    usr <- par("usr")
    offset <- (usr[4] - usr[3]) * 0.02
    if (bottom) {
        y_at <- usr[3] + offset
        side <- 4
    } else {
        y_at <- usr[4] - offset
        side <- 2
    }
    text(when, y_at, names(flat), srt = 90, adj = 0, pos = side)
}
